## Example code for running the Nielsen Stemmer for Arabic
## Rich Nielsen
## nielsen.rich@gmail.com

## If you use this stemmer, please cite:
## Nielsen, Richard A. 2017. Deadly Clerics: Blocked Ambition and the Paths to Jihad. Cambridge University Press.

## install the stemmer, but only if it is not already available
## (an unconditional install.packages() call would reinstall it on every run)
if (!requireNamespace("arabicStemR", quietly = TRUE)) {
  install.packages("arabicStemR")
}

## load the package
library(arabicStemR)

## Get some text to work with: the current front page of Aljazeera, read as
## UTF-8 and joined into one long string.
dat <- paste(
  readLines(con = "http://aljazeera.net/portal", encoding = "UTF-8"),
  collapse = " "
)
## Strip out the html tags (everything from a "<" up to the next ">").
dat <- gsub(pattern = "<.*?>", replacement = "", x = dat)
## Stem the text and transliterate the result.
stem(dat)

## stem and return the stemlist
out <- stem(dat, returnStemList = TRUE)
out$text
out$stemlist
## This allows you to see which words are being combined
## Interpret this as follows:
i <- 1
## This is the i'th stem in quotes (with the original word as the label)
out$stemlist[i]
## These are all the words that resolve to the same stem.
names(out$stemlist)[out$stemlist == out$stemlist[i]]
## And this will provide a count.
mytab <- table(names(out$stemlist)[out$stemlist == out$stemlist[i]])
## Print the entries one at a time. A separate index (j) is used so that i
## keeps pointing at the stem chosen above, and seq_along() makes the loop a
## no-op if the table is empty.
for (j in seq_along(mytab)) {
  print(mytab[j])
}
## Note that if you just look at "mytab", it will appear incorrect because
## R displays the Arabic labels from right to left but the numbers from left
## to right (thanks R!).

## This can be done for all of the stems.
## lapply() is used rather than sapply() so that the result is always a list
## with one table per entry of out$stemlist; sapply() would collapse the
## tables into a vector or matrix whenever they all have the same length,
## which would break the result[[i]] indexing below.
result <- lapply(out$stemlist, function(x) {
  table(names(out$stemlist)[out$stemlist == x])
})
for (i in seq_along(result)) {
  cat(paste("stemmed:", out$stemlist[i], "\n"))
  cat("unstemmed:")
  print(result[[i]])
  cat("\n")
}
## display the results correctly for the i'th stem
## (one entry per line, to avoid the mixed right-to-left / left-to-right
## display of a whole table)
i <- 1
for (j in seq_along(result[[i]])) {
  print(result[[i]][j])
}



## Some processing steps can be switched off with arguments: character
## cleaning (cleanChars), removal of Latin characters (cleanLatinChars), and
## transliteration.
## This version keeps Latin characters and does not transliterate, but does stem the Arabic.
## (TRUE/FALSE are spelled out because T and F are ordinary variables that can be reassigned.)
stem(dat, cleanChars = FALSE, cleanLatinChars = FALSE, transliteration = FALSE)

## The stem function is just a wrapper for the sub-functions.
## Printing it shows which sub-functions it calls and in what order.
stem
## This means that you can create your own custom stemming easily
## For example, if you want to not clean up diacritics:

## Custom variant of the stemming pipeline that leaves Arabic diacritics in
## place (the removeDiacritics() step is deliberately not called).
##
## Arguments:
##   dat             - the text to process
##   cleanChars      - if TRUE, apply cleanChars() (removes all unicode chars
##                     except Latin chars and the Arabic alphabet)
##   cleanLatinChars - if TRUE, apply cleanLatinChars() (removes Latin chars)
##   transliteration - if TRUE, transliterate the stemmed text
##   returnStemList  - if TRUE, return a list with the text and the stem list
##
## Returns the processed text, or list(text=, stemlist=) when
## returnStemList is TRUE.
stemCustom <- function(dat, cleanChars=TRUE, cleanLatinChars=TRUE, 
                 transliteration=TRUE, returnStemList=FALSE){
    ## Basic cleanup: newline-type characters (\n\r\t\f\v), then punctuation.
    dat <- removePunctuation(removeNewlineChars(dat))
    ## NOTE: no removeDiacritics() here -- that is the point of this variant.

    ## Drop English, Arabic and Farsi numbers, in that order.
    dat <- removeEnglishNumbers(dat)
    dat <- removeArabicNumbers(dat)
    dat <- removeFarsiNumbers(dat)

    ## Standardize the different hamzas on alif seats.
    dat <- fixAlifs(dat)

    ## Optional character cleaning. The logical arguments share their names
    ## with the package functions; R skips non-function objects when it
    ## resolves a call, so the calls below still reach the functions.
    if (cleanChars) {
        dat <- cleanChars(dat)
    }
    if (cleanLatinChars) {
        dat <- cleanLatinChars(dat)
    }

    ## Remove the stopwords, keeping only the text component of the result.
    dat <- removeStopWords(dat)$text

    ## Plain-text result: strip prefixes, then suffixes.
    if (returnStemList != TRUE) {
        stemmed <- removeSuffixes(removePrefixes(dat))
        if (transliteration) {
            stemmed <- transliterate(stemmed)
        }
        return(stemmed)
    }

    ## Stem-list result: doStemming() removes prefixes and suffixes and also
    ## returns the list matching words to stemmed words.
    stemming <- doStemming(dat)
    stemmedText <- stemming$text
    if (transliteration) {
        stemmedText <- transliterate(stemmedText)
    }
    list(text = stemmedText, stemlist = stemming$stemmedWords)
}

## Some Arabic with diacritics
x <- '\u0627\u0647\u0644\u0627\u064b \u0648\u0633\u0644\u0627\u064b'
print(x)
## Stem it, retaining diacritics
## (FALSE is spelled out because F is an ordinary variable that can be reassigned)
stemCustom(x, cleanChars = FALSE, transliteration = FALSE)
## Note that cleanChars must be set to FALSE for diacritics to be retained 
## because cleanChars removes Arabic Diacritics as well.


## to see what stopwords are removed
## (the input text is irrelevant here; only the returned stopword list is used)
removeStopWords("a")$arabicStopwordList